Render this script with > sbatch ~/spinal_cord_paper/scripts/Seurat_Gg_NT_int_array.sh
the following workflow is based on: - [https://github.com/satijalab/seurat/issues/1500] Cell cycle regression - [https://github.com/satijalab/seurat/issues/1836] General workflow - [https://github.com/satijalab/seurat/issues/2590] Integrate all features (although used in the integration script)
library(Seurat)
library(dplyr)
library(Rtsne)
library(RColorBrewer)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(patchwork)
library(cowplot)
library(ggdendro)
# Gene annotation table from the modplots package; it is joined on
# "Gene.stable.ID" and handed to the modplots plotting helpers further down.
gnames <- modplots::gnames

# Load the integrated Seurat object and give it the project name that is
# reused in every output file name below.
my.int <- readRDS(
  file = "~/spinal_cord_paper/data/Gg_poly_integrated_060223.rds"
)
my.int@project.name <- "Gg_poly_int"
my.int
Date: 07.03.23
We order the plot by nCount_RNA to prevent overplotting of the last sample (default order is the group.by ident).
# Have a look at the PCA and the previously calculated UMAP, coloured by the
# sample of origin.
# NOTE(review): Seurat documents `order` as either a logical or a vector of
# ident values to draw last, so passing "nCount_RNA" here may not sort the
# points by UMI count. `shuffle = TRUE` is the documented way to randomise the
# draw order -- confirm before relying on this to prevent overplotting.
PCAPlot(
  my.int,
  group.by = "orig.ident",
  order = "nCount_RNA"
)

DimPlot(
  my.int,
  reduction = "umap",
  group.by = "orig.ident",
  order = "nCount_RNA"
)
Now we can score the cell cycle stage using Seurat's function. For this we need the orthologs of the cell cycle marker gene lists distributed with Seurat.
We do the CC.difference calculation at this stage - and not prior to integration - based on the suggestion from: [https://github.com/satijalab/seurat/issues/1500]
We add the scores of stages S and G2M, as well as the difference between them, to the metadata with the names: S.Score, G2M.Score, CC.Difference.seurat
# Load the orthology table and give its columns fixed names
ortho_gg_mm_v102 <- readRDS("~/spinal_cord_paper/data/ortho_gg_mm_v102.rds")

colnames(ortho_gg_mm_v102) <- c(
  "GG_gene_ID",
  "GG_gene_Name",
  "MM_gene_ID",
  "MM_gene_Name",
  "ortho_conf",
  "homolog_type"
)

# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat
# (`cc.genes`). We segregate this list into markers of S phase and markers of
# G2/M phase. The MM gene names are upper-cased so they can be matched against
# the `cc.genes` symbols; the result is the corresponding GG gene IDs, kept in
# the order of the `cc.genes` vectors.
s.genes <- ortho_gg_mm_v102 %>%
  dplyr::mutate(MM_gene_Name = toupper(MM_gene_Name)) %>%
  dplyr::filter(MM_gene_Name %in% cc.genes$s.genes) %>%
  dplyr::arrange(match(MM_gene_Name, cc.genes$s.genes)) %>%
  dplyr::pull(GG_gene_ID)

g2m.genes <- ortho_gg_mm_v102 %>%
  dplyr::mutate(MM_gene_Name = toupper(MM_gene_Name)) %>%
  dplyr::filter(MM_gene_Name %in% cc.genes$g2m.genes) %>%
  dplyr::arrange(match(MM_gene_Name, cc.genes$g2m.genes)) %>%
  dplyr::pull(GG_gene_ID)

t0 <- Sys.time()
my.int <- CellCycleScoring(
  my.int,
  s.features = s.genes,
  g2m.features = g2m.genes,
  set.ident = TRUE
)
# `Sys.time() - t0` chooses its own units (secs, mins, ...), which made the
# "seconds" in the message wrong for longer runs; force the units to seconds.
paste0(
  "Seurats CC scoring took ",
  as.numeric(difftime(Sys.time(), t0, units = "secs")),
  " seconds to run."
)

# Difference between the S and G2M scores; regressed out when rescaling
my.int$CC.Difference.seurat <- my.int$S.Score - my.int$G2M.Score

# view cell cycle scores and phase assignments
head(my.int[[]])

rm(t0, s.genes, g2m.genes)
We rescale the data to regress out the CC.difference
# Rescale every gene while regressing out the S - G2M score difference, then
# inspect the PCA coloured by the assigned cell cycle phase.
all_genes <- rownames(my.int)

my.int <- ScaleData(
  my.int,
  features = all_genes,
  vars.to.regress = "CC.Difference.seurat"
)

PCAPlot(my.int, group.by = "Phase")
We will run the tSNE using the FFT-accelerated Interpolation-based t-SNE (FIt-SNE). We run two: a normal tSNE and an exaggerated tSNE, where clusters are tighter together. The latter is located under the xtsne name.
# Get the tSNE function
source("~/Software/FIt-SNE-master/fast_tsne.R")

# NOTE(review): `my.dimensions` is assigned in the PCA section of this
# document (`1:20`); it has to exist before this chunk is run.
# The embedding is initialised from the first two PCs, divided by the standard
# deviation of PC1 and multiplied by 0.0001. The learning rate and the second
# perplexity value are derived from the number of cells.
my.tsne <- fftRtsne(
  my.int@reductions$pca@cell.embeddings[, my.dimensions],
  max_iter = 1000,
  learning_rate = round(ncol(my.int) / 12),
  initialization =
    (my.int@reductions$pca@cell.embeddings[, 1:2] /
      my.int@reductions$pca@stdev[1]) * 0.0001,
  perplexity_list = c(30, round(ncol(my.int) / 100)),
  fast_tsne_path = "~/Software/FIt-SNE-master/bin/fast_tsne"
)

colnames(my.tsne) <- c("tsne_1", "tsne_2")
rownames(my.tsne) <- colnames(my.int)

# Store the embedding on the object as the "tsne" reduction
my.int[["tsne"]] <- CreateDimReducObject(
  embeddings = my.tsne,
  key = "tsne_",
  assay = DefaultAssay(my.int),
  global = TRUE
)
Here we perform the Louvain-jaccard clustering implemented in Seurat.
We can see the tree of clusters, to see how the clusters relate in the
PCA space.
We also use the tree to check whether any of the terminal pairs of sister clusters
should be merged. Sisters are merged when fewer than 5 DE genes are found
between them.
# Find first the nearest neighbors
# NOTE(review): `my.dimensions` is assigned in the PCA section of this
# document; it has to exist before this chunk is run.
my.int <- FindNeighbors(
  object = my.int,
  dims = my.dimensions,
  verbose = FALSE
)
# Then the actual clusters
my.int <- FindClusters(
  object = my.int,
  resolution = 1,
  verbose = FALSE,
  random.seed = 42
)
# Check the tree of clusters, to see what's the relationship between them
my.int <- BuildClusterTree(my.int, dims = my.dimensions, verbose = FALSE)
plot(Tool(object = my.int, slot = 'BuildClusterTree'))
# We check for DEs using the non-integrated data (RNA assay). Only genes that
# show variability are tested, so we calculate the variable genes first.
my.int <- FindVariableFeatures(my.int, assay = "RNA")
# Only the genes with variability > median (third column of HVFInfo)
my.HVF <- HVFInfo(my.int, assay = "RNA")
my.HVF <- rownames(my.HVF)[which(my.HVF[, 3] > median(my.HVF[, 3]))]

# For each pair of terminal sister clusters we count the DE genes. If a pair
# has fewer than 5, the two clusters are merged and the whole check is
# repeated on a freshly built tree until no pair is merged any more.
keep.check <- TRUE
while (keep.check) {
  keep.check <- FALSE
  # Check the tree of clusters, to see what's the relationship between them
  my.int <- BuildClusterTree(my.int, dims = my.dimensions, verbose = FALSE)
  # Check only the terminal sisters
  to.check <- ips::terminalSisters(my.int@tools$BuildClusterTree)
  for (i in to.check) {
    # DE between the sisters, keeping genes with |log2FC| > 0.5 and
    # adjusted p < 0.05
    my.DE <- FindMarkers(
      my.int, i[1], i[2],
      test.use = "MAST",
      latent.vars = c("CC.Difference.seurat"),
      min.pct = 0.25,
      verbose = TRUE,
      assay = "RNA",
      features = my.HVF
    ) %>%
      dplyr::filter(abs(avg_log2FC) > 0.5) %>%
      dplyr::filter(p_val_adj < 0.05)
    # If less than 5, merge (second sister takes the first one's identity),
    # and repeat
    if (nrow(my.DE) < 5) {
      cat(paste0(nrow(my.DE), " genes differentially expressed between clusters ", i[1], " and ", i[2], " merging \n"))
      my.int <- SetIdent(
        my.int,
        cells = WhichCells(my.int, idents = i[2]),
        value = i[1]
      )
      keep.check <- TRUE
    }
    print(i)
  }
}
rm(to.check, my.HVF, my.DE)
# Renumber the clusters starting from 1: sort the current identity levels
# numerically, then relabel them 1..n in that order.
my.ID <- factor(
  Idents(my.int),
  levels = levels(Idents(my.int))[
    base::order(as.numeric(levels(Idents(my.int))))
  ]
)
levels(my.ID) <- seq_along(levels(my.ID))
Idents(my.int) <- my.ID
my.int[["seurat_clusters"]] <- my.ID

# Check again the clusters
dim1 <- DimPlot(
  my.int,
  reduction = "tsne",
  cols = rainbow(length(levels(my.ID))),
  label = TRUE,
  label.size = 5,
  pt.size = 1
)
dim1
Plot the tree again
# Rebuild the cluster tree on the selected PCA dimensions and plot it
my.int <- BuildClusterTree(
  my.int,
  dims = my.dimensions,
  verbose = FALSE
)
plot(Tool(object = my.int, slot = 'BuildClusterTree'))
Now that we have the final reductions, we’ll choose one and we look at the statistics of the cells.
# Run the actual PCA (by default uses var.features of assay)
my.int <- RunPCA(my.int, verbose = FALSE)

PCAPlot(my.int, group.by = "Phase")

# Which dimensions will we choose? The histogram shows the variance
# (squared standard deviation) of each PC.
hist(my.int@reductions$pca@stdev^2, breaks = 500)

# PCs used by the tSNE, neighbour graph and cluster tree steps
my.dimensions <- 1:20
# set and get dim.reduct embeddings
my.reduc <- "tsne"

emb <- data.frame(Embeddings(my.int, my.reduc))
colnames(emb) <- c("reduc_1", "reduc_2")

meta <- my.int[[]] %>%
  tibble::rownames_to_column("cell_ID")

# Colour ramp shared by the gradient plots below
heat_cols <- c(
  "gray90", "gray90", "gray80", "yellow",
  "orange", "red", "darkred", "darkred"
)

# Plot the embedding coloured by one numeric metadata column. Cells are drawn
# in increasing order of that column so that high values end up on top.
#   column       name of a numeric column in `meta`
#   legend_title title of the colour legend
#   colours      colours handed to scale_colour_gradientn()
#   point_size   point size
#   axis_labels  label the axes "<my.reduc>_1" / "<my.reduc>_2"?
# Coordinates and colour value travel in the same data frame, so they cannot
# get out of step with each other.
plot_meta_gradient <- function(column, legend_title, colours = heat_cols,
                               point_size = 1, axis_labels = TRUE) {
  draw_order <- order(meta[[column]])
  plot_df <- emb[meta$cell_ID[draw_order], ]
  plot_df$value <- meta[[column]][draw_order]

  p <- ggplot(plot_df, aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color = value), size = point_size, alpha = 0.4, pch = 19) +
    scale_colour_gradientn(colours = colours) +
    theme_classic() +
    labs(colour = legend_title)
  if (axis_labels) {
    p <- p + labs(x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))
  }
  p
}

my.plots <- list(
  plot_meta_gradient("nCount_RNA", "UMI count"),
  plot_meta_gradient("nFeature_RNA", "gene count"),
  # The two SCT panels previously ordered the cells by the SCT column but
  # coloured them by the sorted RNA column, so colours did not belong to the
  # cells they were drawn on. They now show the SCT values.
  plot_meta_gradient("nCount_SCT", "UMI count (SCT)"),
  plot_meta_gradient("nFeature_SCT", "gene count (SCT)"),
  # NOTE(review): the legend says "log1p", but `percent.mt` is plotted as
  # stored -- confirm whether it is already log1p-transformed upstream.
  plot_meta_gradient(
    "percent.mt", "log1p MT percent",
    colours = c(
      "gray90", "gray80", "yellow", "orange", "red", "darkred", "darkred"
    )
  ),
  plot_meta_gradient("CC.Difference.seurat", "Cell Cycle\nS-G2M"),
  plot_meta_gradient(
    "percent.rb", "percent.rb",
    point_size = 2, axis_labels = FALSE
  )
)

grid.arrange(grobs = my.plots, ncol = 2)

# Embedding coloured by dataset of origin, drawn in random order so that no
# sample is systematically plotted on top. Whole rows are shuffled
# (coordinates together with their label); the previous version shuffled only
# the colour vector, which assigned sample labels to random cells.
orig_df <- emb
orig_df$orig.ident <- meta$orig.ident[match(rownames(emb), meta$cell_ID)]
orig_df <- orig_df[sample(seq_len(nrow(orig_df))), ]

dim.orig.ident <- ggplot(orig_df, aes(x = reduc_1, y = reduc_2)) +
  geom_point(aes(color = orig.ident), size = 0.8, alpha = 0.6, pch = 19) +
  scale_colour_manual(
    values = rainbow(length(table(my.int@meta.data$orig.ident)))
  ) +
  theme_classic() +
  labs(
    colour = "Dataset",
    x = paste0(my.reduc, "_1"),
    y = paste0(my.reduc, "_2")
  )

dim.orig.ident
To identify the different DV domains of the neuron and progenitor clusters, we plot their specific markers.
# Marker genes per neuronal DV domain (one character vector per domain)
neurons <- list(
  dI1 = c("LHX2", "LHX9", "BARHL1", "BARHL2", "POU4F1"),
  dI2 = c("LHX1", "LHX5", "POU4F1"),
  dI3 = c("ISL1", "TLX3", "DRGX", "POU4F1"),
  dI4 = c("LBX1", "PAX2", "LHX1", "LHX5"),
  dI5 = c("LBX1", "LMX1B", "TLX3", "DRGX", "POU4F1"),
  dI6 = c("LBX1", "PAX2", "LHX1", "LHX5"),
  V0 = c("EVX1", "PAX2", "LHX1", "LHX5"),
  V1 = c("EN1", "PAX2", "LHX1", "LHX5"),
  V2a = c("VSX1", "SOX14", "LHX3"),
  V2b = c("GATA2", "GATA3", "TAL1"),
  MN = c("MNX1", "ISL1", "LHX3", "ISL2", "SLC18A3")
)

# Marker genes per progenitor DV domain
prog <- list(
  dp1_3 = c("PAX6", "IRX3", "IRX5", "MSX1", "PAX3", "PAX7"),
  dp4 = c("PAX6", "IRX3", "IRX5", "GSX1", "PAX3", "PAX7"),
  dp5 = c("PAX6", "IRX3", "IRX5", "DBX2", "GSX1", "PAX3", "PAX7"),
  dp6 = c("PAX6", "IRX3", "IRX5", "DBX2", "LEUTX", "PAX3", "PAX7"),
  p0 = c("PAX6", "IRX3", "IRX5", "DBX2", "LEUTX"),
  p1 = c("PAX6", "IRX3", "IRX5", "DBX2", "PRDM12"),
  p2 = c("PAX6", "IRX3", "IRX5", "FOXN4", "NKX6-1"),
  pMN = c("PAX6", "OLIG2", "NKX6-1"),
  p3 = c("NKX2-8", "NKX2-2", "NKX6-1")
)
# Single-argument wrapper around modplots::mFeaturePlot so that it can be
# mapped over a list of marker vectors with lapply(). `x` is one vector of
# features; the plotting object (`my.int`) and the annotation table (`gnames`)
# are taken from the global environment.
feat_list_plot <- function(x) {
  feature_panels <- modplots::mFeaturePlot(
    my.int,
    my.features = x,
    gnames = gnames,
    size = 0.2,
    return = TRUE
  )
  feature_panels
}
# Small, legend-free tSNE coloured by cluster; it is added as an extra panel
# to every marker page in the PDFs below.
tsne_dim <- TSNEPlot(
  my.int,
  reduction = "tsne",
  cols = rainbow(length(levels(my.ID))),
  pt.size = 0.01,
  label.size = 1,
  label = TRUE
) +
  ggplot2::theme(legend.position = "none")

# One set of feature plots per progenitor / neuronal domain
plots_prog <- lapply(prog, feat_list_plot)
plots_neur <- lapply(neurons, feat_list_plot)
Not all features are present in your object! Removing: LHX2
Not all features are present in your object! Removing: LHX5
Not all features are present in your object! Removing: LBX1 LHX5
Not all features are present in your object! Removing: LBX1
Not all features are present in your object! Removing: LBX1 LHX5
Not all features are present in your object! Removing: LHX5
Not all features are present in your object! Removing: EN1 LHX5
Not all features are present in your object! Removing: ISL2
# Write one PDF page per progenitor domain: the domain's marker feature plots
# plus the cluster tSNE, arranged in three columns with the domain as title.
pdf(
  paste0(
    "~/spinal_cord_paper/figures/DV_prog_domain_",
    my.int@project.name,
    ".pdf"
  ),
  width = 7,
  height = 5
)
for (j in names(prog)) {
  plots_prog[[j]][["tsne"]] <- tsne_dim
  gridExtra::grid.arrange(grobs = plots_prog[[j]], ncol = 3, top = j)
}
dev.off()
png
2
# Write one PDF page per neuronal domain: the domain's marker feature plots
# plus the cluster tSNE, arranged in three columns with the domain as title.
pdf(
  paste0(
    "~/spinal_cord_paper/figures/DV_neur_domain_",
    my.int@project.name,
    ".pdf"
  ),
  width = 7,
  height = 5
)
for (j in names(neurons)) {
  plots_neur[[j]][["tsne"]] <- tsne_dim
  gridExtra::grid.arrange(grobs = plots_neur[[j]], ncol = 3, top = j)
}
dev.off()
png
2
Here we do the differential expression analysis, and end up with the marker gene lists. We also show the marker gene dot plot for the top marker gene per cluster.
# Find all the marker genes with MAST: positive markers only, tested on the
# RNA assay and restricted to the variable features of the integrated assay,
# with the cell cycle difference as latent variable.
semarkers <- FindAllMarkers(
  my.int,
  features = my.int[["integrated"]]@var.features,
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.5,
  latent.vars = c("CC.Difference.seurat"),
  test.use = "MAST",
  assay = "RNA",
  return.thresh = 0.05
)

# We only keep the significant ones and attach the gene annotation
semarkers <- semarkers %>%
  dplyr::filter(p_val_adj < 0.05) %>%
  dplyr::rename(Gene.stable.ID = gene) %>%
  dplyr::left_join(gnames, by = "Gene.stable.ID")

# Top 10 and top 1 markers per cluster by adjusted p-value (ties are kept, so
# a cluster can contribute more rows than the nominal number)
semrk10 <- semarkers %>%
  dplyr::group_by(cluster) %>%
  dplyr::top_n(-10, p_val_adj)
semrk1 <- semarkers %>%
  dplyr::group_by(cluster) %>%
  dplyr::top_n(-1, p_val_adj)

modplots::mDotPlot2(
  my.int,
  features = unique(semrk1$Gene.stable.ID),
  cols = c("grey", "black"),
  gnames = gnames,
  dot.scale = 6
) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

DoHeatmap(my.int, semrk1$Gene.stable.ID)

# Save the object and the marker tables; file names carry the project name
# and today's date (ddmmyy).
saveRDS(
  my.int,
  paste0(
    "~/spinal_cord_paper/data/", my.int@project.name,
    "_seurat_", format(Sys.Date(), "%d%m%y"), ".rds"
  )
)

write.table(
  semarkers,
  sep = "\t", row.names = TRUE, col.names = TRUE,
  file = paste0(
    "~/spinal_cord_paper/data/", my.int@project.name,
    "_fullDE_", format(Sys.Date(), "%d%m%y"), ".txt"
  ),
  quote = FALSE
)

write.table(
  semrk10,
  sep = "\t", row.names = TRUE, col.names = TRUE,
  file = paste0(
    "~/spinal_cord_paper/data/", my.int@project.name,
    "_top10DE_", format(Sys.Date(), "%d%m%y"), ".txt"
  ),
  quote = FALSE
)
# Date and time of rendering, printed so the report records when it was
# generated
Sys.time()
[1] \2023-02-07 11:11:02 CET\
# Print the R version and the attached / loaded package versions
sessionInfo()
R version 4.1.0 (2021-05-18)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /scicore/soft/apps/OpenBLAS/0.3.1-GCC-7.3.0-2.30/lib/libopenblas_sandybridgep-r0.3.1.so
locale:
[1] en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggdendro_0.1.22 cowplot_1.1.1 patchwork_1.1.1 gridExtra_2.3
[5] ggplot2_3.3.3 tidyr_1.1.3 RColorBrewer_1.1-2 Rtsne_0.15
[9] dplyr_1.0.10 SeuratObject_4.0.2 Seurat_4.0.5
loaded via a namespace (and not attached):
[1] fastmatch_1.1-0 plyr_1.8.6
[3] igraph_1.2.6 lazyeval_0.2.2
[5] sp_1.4-5 splines_4.1.0
[7] listenv_0.8.0 scattermore_0.7
[9] GenomeInfoDb_1.28.0 digest_0.6.27
[11] htmltools_0.5.1.1 ips_0.0.11
[13] fansi_0.5.0 magrittr_2.0.1
[15] memoise_2.0.0 tensor_1.5
[17] cluster_2.1.2 ROCR_1.0-11
[19] globals_0.16.2 Biostrings_2.60.0
[21] matrixStats_0.58.0 modplots_1.0.0
[23] spatstat.sparse_3.0-0 prettyunits_1.1.1
[25] colorspace_2.0-1 blob_1.2.1
[27] ggrepel_0.9.1 xfun_0.34
[29] crayon_1.4.1 RCurl_1.98-1.3
[31] jsonlite_1.7.2 spatstat.data_3.0-0
[33] phangorn_2.7.0 ape_5.5
[35] survival_3.2-11 zoo_1.8-9
[37] glue_1.6.2 polyclip_1.10-0
[39] gtable_0.3.0 zlibbioc_1.38.0
[41] XVector_0.32.0 leiden_0.3.9
[43] DelayedArray_0.18.0 SingleCellExperiment_1.14.1
[45] future.apply_1.7.0 BiocGenerics_0.38.0
[47] abind_1.4-5 scales_1.1.1
[49] pheatmap_1.0.12 DBI_1.1.1
[51] miniUI_0.1.1.1 Rcpp_1.0.7
[53] progress_1.2.2 viridisLite_0.4.0
[55] xtable_1.8-4 reticulate_1.22
[57] spatstat.core_2.1-2 bit_4.0.4
[59] stats4_4.1.0 htmlwidgets_1.5.3
[61] httr_1.4.2 ellipsis_0.3.2
[63] ica_1.0-2 XML_3.99-0.6
[65] farver_2.1.0 pkgconfig_2.0.3
[67] sass_0.4.0 uwot_0.1.10
[69] deldir_1.0-6 utf8_1.2.1
[71] labeling_0.4.2 tidyselect_1.2.0
[73] rlang_1.0.6 reshape2_1.4.4
[75] later_1.2.0 AnnotationDbi_1.54.0
[77] munsell_0.5.0 tools_4.1.0
[79] cachem_1.0.5 cli_3.4.1
[81] generics_0.1.3 RSQLite_2.2.7
[83] ggridges_0.5.3 org.Gg.eg.db_3.13.0
[85] evaluate_0.20 stringr_1.4.0
[87] fastmap_1.1.0 yaml_2.2.1
[89] goftest_1.2-2 knitr_1.41
[91] bit64_4.0.5 fitdistrplus_1.1-6
[93] purrr_0.3.4 RANN_2.6.1
[95] KEGGREST_1.32.0 pbapply_1.4-3
[97] future_1.30.0 nlme_3.1-152
[99] mime_0.10 compiler_4.1.0
[101] plotly_4.10.0 png_0.1-7
[103] spatstat.utils_3.0-1 tibble_3.1.8
[105] bslib_0.2.5.1 stringi_1.6.2
[107] highr_0.9 lattice_0.20-44
[109] Matrix_1.3-3 vctrs_0.5.1
[111] pillar_1.8.1 lifecycle_1.0.3
[113] spatstat.geom_3.0-3 lmtest_0.9-38
[115] jquerylib_0.1.4 RcppAnnoy_0.0.19
[117] data.table_1.14.0 bitops_1.0-7
[119] irlba_2.3.3 GenomicRanges_1.44.0
[121] httpuv_1.6.1 R6_2.5.0
[123] promises_1.2.0.1 KernSmooth_2.23-20
[125] IRanges_2.26.0 parallelly_1.33.0
[127] codetools_0.2-18 MASS_7.3-54
[129] assertthat_0.2.1 MAST_1.18.0
[131] SummarizedExperiment_1.22.0 withr_2.4.2
[133] sctransform_0.3.3 S4Vectors_0.30.0
[135] GenomeInfoDbData_1.2.6 hms_1.1.0
[137] mgcv_1.8-35 parallel_4.1.0
[139] quadprog_1.5-8 grid_4.1.0
[141] rpart_4.1-15 rmarkdown_2.17
[143] MatrixGenerics_1.4.0 Biobase_2.52.0
[145] shiny_1.6.0